library(quanteda)
library(dplyr)
library(lubridate)
library(ggplot2)
library(tidyr)
library(readxl)

# --------------------------------
# load data
# --------------------------------

# Granier data: read the saved object, then keep only rows with non-empty text
# spoken by one of the two listed speakers.
granier_data <- readRDS("data/outputs/granier_data.rds")
granier_data_sub <- granier_data %>%
  filter(text != "") %>%
  filter(speaker %in% c("Hugo Chavez", "Henrique Salas Romer"))

# quanteda corpus built from the retained texts, with `speaker` as its docvar
corpus_granier <- corpus(
  granier_data_sub$text,
  docvars = data.frame(speaker = granier_data_sub$speaker)
)

# Coded NNs: one spreadsheet per theme. Within this script only the `node`
# column of each sheet is used (to build the dictionary).
nns_constituyente <- read_excel(
  path = "data/outputs/nns_constituyente_all.xlsx"
)
nns_pobreza <- read_excel(
  path = "data/outputs/nns_pobreza_all.xlsx"
)

# --------------------------------
# build dictionary
# --------------------------------
# Two-key quanteda dictionary: each theme's entries are the values of the
# `node` column in the corresponding coded-NN sheet.
# NOTE(review): an earlier comment described these as the "top 20 terms", but
# the inputs are the *_all.xlsx files -- confirm whether they are pre-trimmed.
theme_terms <- list(
  constituyente = nns_constituyente$node,
  pobreza = nns_pobreza$node
)
dict <- dictionary(theme_terms)

# --------------------------------
# count dictionary terms in corpus
# --------------------------------

# Tokenize once and reuse the result for both the dfm and the per-document
# token totals. Calling ntoken() directly on a corpus tokenizes it a second
# time and is deprecated in quanteda >= 4.0; ntoken() on the tokens object
# yields the same counts.
toks_granier <- tokens(corpus_granier)

# Document-by-theme counts: dfm_lookup() collapses the features that match
# each dictionary key into a single column named after that key.
dfm_granier <- dfm_lookup(dfm(toks_granier, tolower = TRUE), dictionary = dict)

# Per-speaker share of tokens matching each theme, reshaped to long format
# (columns: speaker, theme, value).
dict_counts <- dfm_granier %>%
  convert(to = "data.frame") %>%
  mutate(
    # Rows from convert() follow document order, so these vectors align.
    num_tokens = unname(ntoken(toks_granier)),
    speaker = docvars(corpus_granier, "speaker")
  ) %>%
  group_by(speaker) %>%
  summarize(
    # summarize() evaluates sequentially: after this line `num_tokens` is the
    # speaker-level total, so the ratios below divide by that total.
    num_tokens = sum(num_tokens),
    constituyente = sum(constituyente) / num_tokens,
    pobreza = sum(pobreza) / num_tokens,
    .groups = "drop_last"
  ) %>%
  select(-num_tokens) %>%
  pivot_longer(cols = c(constituyente, pobreza), names_to = "theme")

# --------------------------------
# table 1
# --------------------------------
# Auto-print the long-format table of per-speaker, per-theme rates.
# NOTE(review): top-level auto-printing does not happen when the file is run
# via source() without echo/print.eval -- confirm how this script is executed.
dict_counts

